/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <ctype.h>
#include <errno.h>

#include "udm_config.h"
#include "udm_common.h"
#include "udm_stopwords.h"
#include "udm_utils.h"
#include "udm_agent.h"


/*
 * Look up a word in the stop list with a binary search.
 *
 * The StopWord array has to be ordered by strcmp() on the word field
 * (the order UdmStopListSort() produces) for the search to be valid.
 *
 * Returns a pointer to the matching entry inside List->StopWord, or
 * NULL when the list has no array or no entry compares equal to word.
 */
UDM_STOPWORD * UdmStopListFind(UDM_STOPLIST *List,const char *word){
	int lo;
	int hi;

	/* Nothing allocated yet: nothing to search */
	if(List->StopWord==NULL)
		return(NULL);

	lo=0;
	hi=List->nstopwords-1;

	while(lo<=hi){
		int mid=(lo+hi)/2;
		int cmp=strcmp(List->StopWord[mid].word,word);

		if(cmp==0)
			return(&List->StopWord[mid]);

		if(cmp<0)
			lo=mid+1;	/* entry sorts before word: go right */
		else
			hi=mid-1;	/* entry sorts after word: go left   */
	}
	return(NULL);
}

static int cmpstop(const void *s1,const void *s2){
	return(strcmp(((const UDM_STOPWORD*)s1)->word,((const UDM_STOPWORD*)s2)->word));
}

/*
 * Sort the stop list in place by word (strcmp order), which is the
 * order the binary search in UdmStopListFind() relies on.
 */
void UdmStopListSort(UDM_STOPLIST *List){
	qsort(List->StopWord,
	      List->nstopwords,
	      sizeof(List->StopWord[0]),
	      cmpstop);
}

/*
 * Add a stopword to the list.
 *
 * If an entry with the same word is already present, no new entry is
 * created; the existing entry's language is replaced with "" instead,
 * marking it as an "international" word (a word without a language) so
 * that it does not confuse the language guesser.
 *
 * Otherwise private copies of stopword->word and stopword->lang (""
 * when lang is NULL) are appended to the array.  The array is NOT kept
 * sorted here; call UdmStopListSort() before using UdmStopListFind().
 *
 * Returns 1 when a new entry was appended, 0 when nothing was appended
 * (the word was already present, or memory could not be allocated; in
 * the out-of-memory case the list is left exactly as it was).
 */
int UdmStopListAdd(UDM_STOPLIST *List,UDM_STOPWORD * stopword){
	size_t j;
	char *word;
	char *lang;
	UDM_STOPWORD *grown;

	/* If the word is already in list     */
	/* We will not add it again           */
	/* But mark it as "international word"*/
	/* i.e. the word without language     */
	/* It will allow to avoid troubles    */
	/* with language guesser              */
	for(j=0;j<List->nstopwords;j++){
		if(!strcmp(List->StopWord[j].word,stopword->word)){
			UDM_FREE(List->StopWord[j].lang);
			List->StopWord[j].lang=strdup("");
			return 0;
		}
	}

	/* Allocate everything first and commit only when all of it      */
	/* succeeded, so a failure never leaves a half-built entry (NULL */
	/* word) in the list and never loses the existing array.         */
	word=strdup(stopword->word);
	lang=strdup(stopword->lang?stopword->lang:"");
	grown=NULL;
	if(word&&lang){
		/* realloc() leaves the old block valid on failure, so its */
		/* result must not be stored in List->StopWord unchecked.  */
		grown=(UDM_STOPWORD *)realloc(List->StopWord,(List->nstopwords+1)*sizeof(UDM_STOPWORD));
	}
	if(!grown){
		free(word);
		free(lang);
		return 0;
	}

	List->StopWord=grown;
	List->StopWord[List->nstopwords].word=word;
	List->StopWord[List->nstopwords].lang=lang;
	List->nstopwords++;

	return(1);
}

/*
 * Release everything owned by the stop list: the word and lang strings
 * of every entry, then the entry array itself.  The entry count is
 * reset to zero afterwards.
 */
void UdmStopListFree(UDM_STOPLIST *List){
	size_t idx=0;

	while(idx<List->nstopwords){
		UDM_STOPWORD *entry=&List->StopWord[idx];

		UDM_FREE(entry->word);
		UDM_FREE(entry->lang);
		idx++;
	}

	UDM_FREE(List->StopWord);
	List->nstopwords=0;
}


/*
 * Load stopwords from a text file into Conf->StopWords.
 *
 * stoplist_file_name is the file to read; when it is NULL the file
 * "stopwords.txt" inside UDM_CONF_DIR is used.  Names longer than the
 * local 1024-byte buffer are truncated.
 *
 * The file is processed line by line:
 *   - lines starting with '#' are skipped;
 *   - "Charset:<name>"  records the charset name of the file;
 *   - "Language:<name>" sets the language attached to following words;
 *   - any other line: its first token (delimited by tab, CR or LF) is
 *     taken as a stopword, converted from the file charset to
 *     Conf->lcs and added with UdmStopListAdd().
 *
 * The charset converter is set up once, when the first word is met; a
 * "Charset:" line must therefore precede the first word, and later
 * "Charset:" lines do not change the conversion already in use.
 *
 * After the whole file has been read the list is sorted so that
 * UdmStopListFind() can be used on it.
 *
 * Returns 0 on success.  On failure (file cannot be opened, no charset
 * given before the first word, unknown charset) returns 1 after
 * writing a message to Conf->errstr and setting Conf->errcode to 1.
 */
__INDLIB__ int UdmStopListLoad(UDM_ENV * Conf,const char * stoplist_file_name){
	char fname[1024];
	char str[1024];
	char * lasttok;
	FILE * stopfile;
	UDM_STOPWORD stopword;
	UDM_CHARSET * cs=NULL;
	UDM_CONV cnv;
	char * charset=NULL;
	int rc=1;	/* pessimistic: switched to 0 only after a full read */

	if (!stoplist_file_name)
	{
		snprintf(fname,sizeof(fname)-1,"%s%c%s",UDM_CONF_DIR,UDMSLASH,"stopwords.txt");
		fname[sizeof(fname)-1]='\0';
	} else
	{
		strncpy(fname,stoplist_file_name,sizeof(fname)-1);
		fname[sizeof(fname)-1]='\0';
	}

	if (!(stopfile=fopen(fname,"r")))
	{
		sprintf(Conf->errstr,"Can't open stopwords file '%s' (%s)", fname, strerror(errno));
		Conf->errcode=1;
		return(1);
	}

	/* word and lang start out NULL so the cleanup code can free lang safely */
	memset(&stopword,0,sizeof(stopword));

	while(fgets(str,sizeof(str),stopfile)){
		if(!str[0])continue;
		if(str[0]=='#')continue;
		
		if(!strncmp(str,"Charset:",8)){
			UDM_FREE(charset);
			charset=strtok_r(str+8," \t\n\r",&lasttok);
			if(charset){
				/* the token points into str, which the next fgets() overwrites */
				charset=strdup(charset);
			}
		}else
		if(!strncmp(str,"Language:",9)){
			UDM_FREE(stopword.lang);
			stopword.lang=strtok_r(str+9," \t\n\r",&lasttok);
			if(stopword.lang)stopword.lang=strdup(stopword.lang);
		}else
		if((stopword.word=strtok_r(str,"\t\n\r",&lasttok))){
			char lwrd[UDM_MAXWORDSIZE+10];
			
			/* Lazily set up the converter when the first word is seen */
			if(!cs){
				if(!charset){
					sprintf(Conf->errstr,"No charset definition in stopwords file '%s'", fname);
					Conf->errcode=1;
					goto cleanup;
				}
				cs=UdmGetCharSet(charset);
				if(!cs){
					sprintf(Conf->errstr,"Unknown charset '%s' in stopwords file '%s'", charset,fname);
					Conf->errcode=1;
					goto cleanup;
				}
				UdmConvInit(&cnv,cs,Conf->lcs,UDM_RECODE_HTML);
			}
			
			/* strlen()+1 so that the terminating NUL is converted as well; */
			/* the last byte is forced to NUL in case the output was cut.   */
			UdmConv(&cnv,lwrd,sizeof(lwrd)-1,stopword.word,strlen(stopword.word)+1);
			lwrd[sizeof(lwrd)-1]='\0';
			stopword.word=lwrd;
			/* UdmStopListAdd() makes its own copies of word and lang */
			UdmStopListAdd(&Conf->StopWords,&stopword);
		}
	}
	UdmStopListSort(&Conf->StopWords);
	rc=0;

cleanup:
	/* Single exit path once the file is open, so the stream and the */
	/* duplicated strings are released on errors as well.            */
	fclose(stopfile);
	UDM_FREE(stopword.lang);
	UDM_FREE(charset);
	return(rc);
}
